YOLO v2.0 Implementation

  1. Implemented from scratch using TensorFlow
  2. Tested on the COCO dataset
In [1]:
#!pip install tensorflow==1.12.0
In [2]:
# Initialization
colab_run=False
anc_box= True
_grid_offset=True
train=False
test_mode=2
data_save=True
root='D:/'
train_dir='COCO/train2017/train2017'
val_dir='COCO/val2017/val2017/'
test_dir='COCO/test2017/test2017/'
if colab_run==True:
  !pip install pydrive
  from pydrive.auth import GoogleAuth
  from pydrive.drive import GoogleDrive
  from google.colab import auth
  from oauth2client.client import GoogleCredentials

  auth.authenticate_user()
  gauth = GoogleAuth()
  gauth.credentials = GoogleCredentials.get_application_default()
  drive = GoogleDrive(gauth)

  import os, cv2
  from google.colab import drive
  drive.mount('/content/drive/')

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Reshape, Activation, Conv2D, Input, MaxPooling2D, BatchNormalization, Flatten, Dense, Lambda
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
from tensorflow.keras.layers import concatenate
from tensorflow.keras.callbacks import ModelCheckpoint
from matplotlib import pyplot as plt
import tensorflow.keras.backend as K
import tensorflow as tf
import numpy as np
import pickle
import os, cv2

#os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
#os.environ["CUDA_VISIBLE_DEVICES"] = ""

# %matplotlib inline
C:\Users\irfan\Anaconda3\envs\tf\lib\site-packages\tensorflow\python\framework\dtypes.py:523: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
C:\Users\irfan\Anaconda3\envs\tf\lib\site-packages\tensorflow\python\framework\dtypes.py:524: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
C:\Users\irfan\Anaconda3\envs\tf\lib\site-packages\tensorflow\python\framework\dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
C:\Users\irfan\Anaconda3\envs\tf\lib\site-packages\tensorflow\python\framework\dtypes.py:526: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
C:\Users\irfan\Anaconda3\envs\tf\lib\site-packages\tensorflow\python\framework\dtypes.py:527: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
C:\Users\irfan\Anaconda3\envs\tf\lib\site-packages\tensorflow\python\framework\dtypes.py:532: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  np_resource = np.dtype([("resource", np.ubyte, 1)])

Configure Block

Model Building Block

In [3]:
LABELS=['person', 'bird', 'cat', 'cow', 'dog', 'horse', 'sheep','aeroplane', 'bicycle',
         'boat', 'bus', 'car', 'motorbike', 'train', 'bottle', 'chair','diningtable',
         'pottedplant', 'sofa', 'tvmonitor']
f=open('coco.txt','r')
LABELS=[]
for line in f:
    line=line.split(':')
    LABELS.append(str(line[1][1:-1]))
f=open('D:/COCO/labels_coco','rb')
_LABELS=pickle.load(f)
f.close()
LABELS=list(_LABELS.values())

IMAGE_H, IMAGE_W = 608, 608
GRID_H,  GRID_W  = 19 , 19
BOX              = 5
CLASS            = len(LABELS)
CLASS_WEIGHTS    = np.ones(CLASS, dtype='float32')
OBJ_THRESHOLD    = 0.3#0.5
NMS_THRESHOLD    = 0.3#0.45
ANCHORS          = [0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828]
#ANCHORS          = [1.3221, 1.73145, 3.19275, 4.00944, 5.05587, 8.09892, 9.47112, 4.84053, 11.2364, 10.0071]           
#ANCHORS          = [ 4.469053,2.148582,10.548851,5.381520,11.420664,9.961033,6.517299,3.699693,2.469196,1.599054]
#ANCHORS=[13,13]
#NO_OBJECT_SCALE  = 1.0
#OBJECT_SCALE     = 5.0
#COORD_SCALE      = 1.0
#CLASS_SCALE      = 1.0

BATCH_SIZE       = 10
WARM_UP_BATCHES  = 0
TRUE_BOX_BUFFER  = 50
lambda_coord=5.0
lambda_noobj=0.5

lr=1e-5
epochs=60
optimizer = Adam(lr=lr, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
#optimizer = SGD(lr=1e-4, decay=0.0005, momentum=0.9)
#optimizer = RMSprop(lr=1e-4, rho=0.9, epsilon=1e-08, decay=0.0)
exp_name='v-2.1/'
root='D:/'
path_wts=root+'yolov2.weights'
path_wts_final=root+exp_name+'best.hdf5'
if not os.path.exists(root+exp_name):
    os.mkdir(root+exp_name)
In [4]:
# the function to implement the orgnization layer (thanks to github.com/allanzelener/YAD2K)
def space_to_depth_x2(x):
    return tf.space_to_depth(x, block_size=2)

def build_model():
  input_image = Input(shape=(IMAGE_H, IMAGE_W, 3))
  true_boxes  = Input(shape=(1, 1, 1, TRUE_BOX_BUFFER , 4))

  # Layer 1
  x = Conv2D(32, (3,3), strides=(1,1), padding='same', name='conv_1', use_bias=False)(input_image)
    
  x = BatchNormalization(name='norm_1')(x)
  x = LeakyReLU(alpha=0.1)(x)
  x = MaxPooling2D(pool_size=(2, 2))(x)

  # Layer 2
  x = Conv2D(64, (3,3), strides=(1,1), padding='same', name='conv_2', use_bias=False)(x)
  x = BatchNormalization(name='norm_2')(x)
  x = LeakyReLU(alpha=0.1)(x)
  x = MaxPooling2D(pool_size=(2, 2))(x)

  # Layer 3
  x = Conv2D(128, (3,3), strides=(1,1), padding='same', name='conv_3', use_bias=False)(x)
  x = BatchNormalization(name='norm_3')(x)
  x = LeakyReLU(alpha=0.1)(x)

  # Layer 4
  x = Conv2D(64, (1,1), strides=(1,1), padding='same', name='conv_4', use_bias=False)(x)
  x = BatchNormalization(name='norm_4')(x)
  x = LeakyReLU(alpha=0.1)(x)

  # Layer 5
  x = Conv2D(128, (3,3), strides=(1,1), padding='same', name='conv_5', use_bias=False)(x)
  x = BatchNormalization(name='norm_5')(x)
  x = LeakyReLU(alpha=0.1)(x)
  x = MaxPooling2D(pool_size=(2, 2))(x)

  # Layer 6
  x = Conv2D(256, (3,3), strides=(1,1), padding='same', name='conv_6', use_bias=False)(x)
  x = BatchNormalization(name='norm_6')(x)
  x = LeakyReLU(alpha=0.1)(x)

  # Layer 7
  x = Conv2D(128, (1,1), strides=(1,1), padding='same', name='conv_7', use_bias=False)(x)
  x = BatchNormalization(name='norm_7')(x)
  x = LeakyReLU(alpha=0.1)(x)

  # Layer 8
  x = Conv2D(256, (3,3), strides=(1,1), padding='same', name='conv_8', use_bias=False)(x)
  x = BatchNormalization(name='norm_8')(x)
  x = LeakyReLU(alpha=0.1)(x)
  x = MaxPooling2D(pool_size=(2, 2))(x)

  # Layer 9
  x = Conv2D(512, (3,3), strides=(1,1), padding='same', name='conv_9', use_bias=False)(x)
  x = BatchNormalization(name='norm_9')(x)
  x = LeakyReLU(alpha=0.1)(x)

  # Layer 10
  x = Conv2D(256, (1,1), strides=(1,1), padding='same', name='conv_10', use_bias=False)(x)
  x = BatchNormalization(name='norm_10')(x)
  x = LeakyReLU(alpha=0.1)(x)

  # Layer 11
  x = Conv2D(512, (3,3), strides=(1,1), padding='same', name='conv_11', use_bias=False)(x)
  x = BatchNormalization(name='norm_11')(x)
  x = LeakyReLU(alpha=0.1)(x)

  # Layer 12
  x = Conv2D(256, (1,1), strides=(1,1), padding='same', name='conv_12', use_bias=False)(x)
  x = BatchNormalization(name='norm_12')(x)
  x = LeakyReLU(alpha=0.1)(x)

  # Layer 13
  x = Conv2D(512, (3,3), strides=(1,1), padding='same', name='conv_13', use_bias=False)(x)
  x = BatchNormalization(name='norm_13')(x)
  x = LeakyReLU(alpha=0.1)(x)

  skip_connection = x

  x = MaxPooling2D(pool_size=(2, 2))(x)

  # Layer 14
  x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_14', use_bias=False)(x)
  x = BatchNormalization(name='norm_14')(x)
  x = LeakyReLU(alpha=0.1)(x)

  # Layer 15
  x = Conv2D(512, (1,1), strides=(1,1), padding='same', name='conv_15', use_bias=False)(x)
  x = BatchNormalization(name='norm_15')(x)
  x = LeakyReLU(alpha=0.1)(x)

  # Layer 16
  x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_16', use_bias=False)(x)
  x = BatchNormalization(name='norm_16')(x)
  x = LeakyReLU(alpha=0.1)(x)

  # Layer 17
  x = Conv2D(512, (1,1), strides=(1,1), padding='same', name='conv_17', use_bias=False)(x)
  x = BatchNormalization(name='norm_17')(x)
  x = LeakyReLU(alpha=0.1)(x)

  # Layer 18
  x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_18', use_bias=False)(x)
  x = BatchNormalization(name='norm_18')(x)
  x = LeakyReLU(alpha=0.1)(x)

  # Layer 19
  x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_19', use_bias=False)(x)
  x = BatchNormalization(name='norm_19')(x)
  x = LeakyReLU(alpha=0.1)(x)

  # Layer 20
  x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_20', use_bias=False)(x)
  x = BatchNormalization(name='norm_20')(x)
  x = LeakyReLU(alpha=0.1)(x)

  # Layer 21
  skip_connection = Conv2D(64, (1,1), strides=(1,1), padding='same', name='conv_21', use_bias=False)(skip_connection)
  skip_connection = BatchNormalization(name='norm_21')(skip_connection)
  skip_connection = LeakyReLU(alpha=0.1)(skip_connection)
  skip_connection = Lambda(space_to_depth_x2)(skip_connection)

  x = concatenate([skip_connection, x])

  # Layer 22
  x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_22', use_bias=False)(x)
  x = BatchNormalization(name='norm_22')(x)
  x = LeakyReLU(alpha=0.1)(x)

  if anc_box==True:
    # Layer 23
    x = Conv2D(BOX*(4 + 1 + CLASS), (1,1), strides=(1,1), padding='same', name='conv_23')(x)
    output = Reshape((GRID_H, GRID_W,BOX,4 + 1 + CLASS))(x)
  else :
    # Layer 23
    x = Conv2D((4 + 1 + CLASS), (1,1), strides=(1,1), padding='same', name='conv_23')(x)
    output = Reshape((GRID_H, GRID_W,4 + 1 + CLASS))(x)

  # small hack to allow true_boxes to be registered when Keras build the model 
  # for more information: https://github.com/fchollet/keras/issues/2790
  #output = Lambda(lambda args: args[0])([output, true_boxes])#Change :Hasib

  #model = Model([input_image, true_boxes], output)#Change :Hasib
  model = Model(input_image, output)
  return model
  #model.load_weights('/content/drive/My Drive/Data/yolo_net_ep500_act.h5')


class WeightReader:
    """Sequential cursor over a Darknet ``.weights`` file.

    The whole file is loaded as a flat float32 array; the first 4 values
    (presumably the Darknet file header — TODO confirm for the exact
    weights version) are skipped via the initial offset.
    """

    def __init__(self, weight_file):
        # Read everything up front; reads then become cheap array slices.
        self.all_weights = np.fromfile(weight_file, dtype='float32')
        self.offset = 4

    def read_bytes(self, size):
        """Return the next `size` float32 values and advance the cursor."""
        start = self.offset
        self.offset = start + size
        return self.all_weights[start:self.offset]

    def reset(self):
        """Rewind the cursor to just past the header."""
        self.offset = 4

def load_weights(model, path_wst):
  """Copy pretrained Darknet weights into `model` in place and return it.

  Bug fix: the original ignored its `path_wst` argument and always read the
  global `path_wts`; the argument is now honoured.  (The parameter name's
  spelling is kept for backward compatibility with keyword callers.)

  Layers are looked up by name (conv_1..conv_23, norm_1..norm_22); every
  conv except the final detection head is BatchNorm-normalised and biasless.
  """
  weight_reader = WeightReader(path_wst)
  weight_reader.reset()
  nb_conv = 23

  for i in range(1, nb_conv+1):
      conv_layer = model.get_layer('conv_' + str(i))

      if i < nb_conv:
          norm_layer = model.get_layer('norm_' + str(i))

          size = np.prod(norm_layer.get_weights()[0].shape)

          # File order is beta, gamma, mean, var; Keras expects
          # [gamma, beta, mean, var].
          beta  = weight_reader.read_bytes(size)
          gamma = weight_reader.read_bytes(size)
          mean  = weight_reader.read_bytes(size)
          var   = weight_reader.read_bytes(size)

          norm_layer.set_weights([gamma, beta, mean, var])

      if len(conv_layer.get_weights()) > 1:
          # Conv with bias (the detection head): bias precedes the kernel.
          bias   = weight_reader.read_bytes(np.prod(conv_layer.get_weights()[1].shape))
          kernel = weight_reader.read_bytes(np.prod(conv_layer.get_weights()[0].shape))
          # Darknet stores kernels in reversed axis order relative to Keras;
          # reshape + transpose restores Keras's (h, w, in, out) layout.
          kernel = kernel.reshape(list(reversed(conv_layer.get_weights()[0].shape)))
          kernel = kernel.transpose([2,3,1,0])
          conv_layer.set_weights([kernel, bias])
      else:
          kernel = weight_reader.read_bytes(np.prod(conv_layer.get_weights()[0].shape))
          kernel = kernel.reshape(list(reversed(conv_layer.get_weights()[0].shape)))
          kernel = kernel.transpose([2,3,1,0])
          conv_layer.set_weights([kernel])
  return model



def yolo_loss_1(y_true, y_pred):
    """YOLOv2-style composite loss: localisation + confidence + class terms.

    y_true / y_pred layout (last axis): [x, y, w, h, objectness, class...].
    Weighted sum: lambda_coord * (xy + wh) + conf + lambda_noobj * noobj + class.

    Changes vs. original: a dead assignment (`pred_box_class = y_pred[..., 4]`,
    immediately overwritten by the class slice) was removed, and the anchor
    tile factor hard-coded as 5 was generalized to BOX.
    """
    # Objectness masks: y_true[..., 4] is 1 where a ground-truth box was
    # assigned to the cell/anchor, 0 elsewhere.
    obj_mask_ex = tf.expand_dims(y_true[..., 4], axis=-1)  # keeps last dim for xy/wh terms
    obj_mask = y_true[..., 4]
    # Complement mask: values are 0/1, so |t - 1| == 1 - t.
    noobj_mask = np.abs(y_true[..., 4] - 1)

    if anc_box == True and _grid_offset == True:
        # Per-cell grid offsets turn the sigmoid xy predictions (cell-relative)
        # into absolute grid coordinates; anchors scale the exp(wh) outputs.
        _x = tf.to_float(tf.reshape(tf.tile(tf.range(GRID_W), [GRID_H]), (1, GRID_H, GRID_W, 1, 1)))
        _y = tf.transpose(_x, (0, 2, 1, 3, 4))
        _grid = tf.tile(tf.concat([_x, _y], -1), [BATCH_SIZE, 1, 1, BOX, 1])

        pred_xy = tf.sigmoid(y_pred[..., :2]) + _grid
        pred_wh = tf.exp(y_pred[..., 2:4]) * np.reshape(ANCHORS, [1, 1, 1, BOX, 2])

    if anc_box == False and _grid_offset == True:
        # Single-box-per-cell variant: same grid offsets, raw wh predictions.
        _x = tf.to_float(tf.reshape(tf.tile(tf.range(GRID_W), [GRID_H]), (1, GRID_H, GRID_W, 1)))
        _y = tf.transpose(_x, (0, 2, 1, 3))
        _grid = tf.tile(tf.concat([_x, _y], -1), [BATCH_SIZE, 1, 1, 1])

        pred_xy = tf.sigmoid(y_pred[..., :2]) + _grid
        pred_wh = y_pred[..., 2:4]

    if _grid_offset == False:
        # Batch_Gen already made the xy targets cell-relative in this mode.
        pred_xy = y_pred[..., 0:2]
        pred_wh = y_pred[..., 2:4]

    true_xy = y_true[..., 0:2]
    true_wh = y_true[..., 2:4]
    pred_conf = y_pred[..., 4]

    # IoU between predicted and true boxes (converted to corner form);
    # used as the confidence target below.
    true_wh_half = true_wh / 2.
    true_mins = tf.subtract(true_xy, true_wh_half)
    true_maxes = tf.add(true_xy, true_wh_half)

    pred_wh_half = pred_wh / 2.
    pred_mins = tf.subtract(pred_xy, pred_wh_half)
    pred_maxes = tf.add(pred_xy, pred_wh_half)

    intersect_mins = tf.maximum(pred_mins, true_mins)
    intersect_maxes = tf.minimum(pred_maxes, true_maxes)
    intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)
    intersect_areas = tf.multiply(intersect_wh[..., 0], intersect_wh[..., 1])

    true_areas = tf.multiply(true_wh[..., 0], true_wh[..., 1])
    pred_areas = tf.multiply(pred_wh[..., 0], pred_wh[..., 1])

    union_areas = tf.subtract(tf.add(pred_areas, true_areas), intersect_areas)
    # +1 on both terms avoids a 0/0 for cells with no box at all
    # (note this biases the IoU toward 1 for tiny areas).
    intersect_areas = tf.add(intersect_areas, 1)
    union_areas = tf.add(union_areas, 1)
    iou_scores = tf.truediv(intersect_areas, union_areas)

    true_box_class = tf.argmax(y_true[..., 5:], -1)
    pred_box_class = y_pred[..., 5:]
    class_mask = y_true[..., 4] * tf.gather(CLASS_WEIGHTS, true_box_class)

    nb_class_box = tf.reduce_sum(tf.to_float(class_mask > 0.0))
    true_conf = tf.multiply(iou_scores, y_true[..., 4])

    # Localisation loss (xy), only where an object is present.
    loss_bb = tf.subtract(true_xy, pred_xy)
    loss_bb = tf.square(loss_bb)
    loss_bb = tf.multiply(loss_bb, obj_mask_ex)
    loss_bb = tf.reduce_sum(loss_bb)

    # wh loss on signed square roots (paper form); zero wh is mapped to 1
    # before the division so the sign computation never divides by zero.
    pred_wh_abs = tf.abs(pred_wh)
    pred_wh_abs = tf.where(tf.equal(pred_wh_abs, 0), tf.ones_like(pred_wh_abs), pred_wh_abs)
    pred_wh_sign = tf.truediv(pred_wh, pred_wh_abs)
    loss_wh = tf.subtract(tf.sqrt(true_wh), tf.multiply(pred_wh_sign, tf.sqrt(pred_wh_abs)))
    loss_wh = tf.square(loss_wh)
    loss_wh = tf.multiply(loss_wh, obj_mask_ex)
    loss_wh = tf.reduce_sum(loss_wh)

    # Confidence loss where an object is present...
    loss_conf = tf.subtract(true_conf, pred_conf)
    loss_conf = tf.square(loss_conf)
    loss_conf = tf.multiply(loss_conf, obj_mask)
    loss_conf = tf.reduce_sum(loss_conf)

    # ...and where none is (down-weighted by lambda_noobj in the sum).
    loss_noobj_conf = tf.subtract(true_conf, pred_conf)
    loss_noobj_conf = tf.square(loss_noobj_conf)
    loss_noobj_conf = tf.multiply(loss_noobj_conf, noobj_mask)
    loss_noobj_conf = tf.reduce_sum(loss_noobj_conf)

    # Classification loss, averaged over the number of responsible boxes.
    loss_class = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=true_box_class, logits=pred_box_class)
    loss_class = tf.reduce_sum(loss_class * class_mask) / (nb_class_box + 1e-6)

    loss = lambda_coord*loss_bb + lambda_coord*loss_wh + loss_conf + lambda_noobj*loss_noobj_conf + loss_class
    return loss

Data Ready Block

In [5]:
# Auxiliary-data prefix: Google Drive path on Colab, working directory otherwise.
data_dir = '/content/drive/My Drive/CNN_Basic/' if colab_run == True else ''
from xml.etree import ElementTree as ET

def read_content(xml_file, _dir):
    """Parse a Pascal-VOC-style annotation XML file.

    Returns (filename, list_with_all_boxes, out): `out` holds the image path,
    original size, and objects with corners rescaled to the network input
    size (IMAGE_W x IMAGE_H).  `list_with_all_boxes` is always empty and is
    kept only for backward compatibility with existing callers.

    Robustness fix: objects without a <bndbox> are now skipped instead of
    raising TypeError on None coordinates.
    """
    objs = []
    tree = ET.parse(xml_file)
    root = tree.getroot()   # NOTE: shadows the global `root` path inside this function

    list_with_all_boxes = []
    filename = root.find('filename').text
    size = root.find('size')
    img_h = int(size.find('height').text)
    img_w = int(size.find('width').text)

    # Scale factors from original size to network input size
    # (loop-invariant; hoisted out of the object loop).
    wf = IMAGE_W / img_w
    hf = IMAGE_H / img_h

    for boxes in root.iter('object'):
        name = boxes.find('name').text
        ymin, xmin, ymax, xmax = None, None, None, None

        for box in boxes.findall("bndbox"):
            ymin = int(box.find("ymin").text)
            xmin = int(box.find("xmin").text)
            ymax = int(box.find("ymax").text)
            xmax = int(box.find("xmax").text)

        if xmin is None:
            # Malformed object with no bounding box — skip it.
            continue

        objs.append({'name': name,
                     'xmin': xmin * wf,
                     'ymin': ymin * hf,
                     'xmax': xmax * wf,
                     'ymax': ymax * hf})

    out = {
        'filename': _dir + 'JPEGImages/' + filename,
        'height': img_h,
        'width': img_w,
        'object': objs
    }

    return filename, list_with_all_boxes, out

def IOU(bboxes1, bboxes2):
    """Intersection-over-Union of two axis-aligned boxes.

    Each box is a sequence (xmin, ymin, xmax, ymax).  Returns 0.0 when the
    union area is zero (both boxes degenerate) instead of dividing by zero
    as the original did.
    """
    x1_min, y1_min, x1_max, y1_max = list(bboxes1)
    x2_min, y2_min, x2_max, y2_max = list(bboxes2)
    # Overlap extents, clamped at zero when the boxes do not intersect.
    inter_w = np.maximum(np.minimum(x1_max, x2_max) - np.maximum(x1_min, x2_min), 0)
    inter_h = np.maximum(np.minimum(y1_max, y2_max) - np.maximum(y1_min, y2_min), 0)
    interArea = inter_w * inter_h
    boxAArea = (x1_max - x1_min) * (y1_max - y1_min)
    boxBArea = (x2_max - x2_min) * (y2_max - y2_min)
    union = boxAArea + boxBArea - interArea
    if union <= 0:
        # Robustness fix: the original returned nan here (0/0).
        return 0.0
    return interArea / union
    
def Batch_Gen(all_data, no_of_batch):
    """Infinite generator yielding (x_batch, y_batch) training pairs.

    x_batch: (n, IMAGE_H, IMAGE_W, 3) float32 images in [0, 1] (RGB).
    y_batch: (n, GRID_H, GRID_W, BOX, 5+len(LABELS)) when anc_box is True,
             else (n, GRID_H, GRID_W, 5+len(LABELS)) — per-cell targets
             [x, y, w, h, objectness, one-hot class].

    NOTE(review): images are always read from root+train_dir, even when
    this generator is fed validation data — confirm intended.
    NOTE(review): obj coordinates appear to be normalised to [0, 1]
    (multiplying by GRID_W/GRID_H yields grid units) — confirm against the
    pickled annotation format.

    Cleanups vs. original: deprecated `np.float` replaced by builtin `float`
    (same dtype), dead locals removed (unused true_box_index; img_w/img_h
    were assigned swapped values and never used; no-op self-assignments).
    """
    while True:
        N = len(all_data)
        _batch_size = N // no_of_batch

        for _ind in range(no_of_batch):
            batch = all_data[_ind*_batch_size:(_ind+1)*_batch_size]
            n = len(batch)
            x_batch = np.zeros((n, IMAGE_H, IMAGE_W, 3), dtype=np.float32)
            if anc_box == True:
                y_batch = np.zeros((n, GRID_H, GRID_W, BOX, 4+1+len(LABELS)), dtype=float)
            else:
                y_batch = np.zeros((n, GRID_H, GRID_W, 4+1+len(LABELS)), dtype=float)
            instance_count = 0

            for sample in batch:
                image_name = sample['filename']
                # NOTE(review): cv2.imread returns None for a missing file,
                # which would crash cv2.resize below.
                img = cv2.imread(root+train_dir+image_name)
                img = cv2.resize(img, (IMAGE_H, IMAGE_W))
                img = img[:, :, ::-1]   # BGR -> RGB
                all_objs = sample['object']

                # Anchor boxes as (0, 0, w, h) for shape-only IoU matching.
                anchors = [[0, 0, ANCHORS[2*i], ANCHORS[2*i+1]] for i in range(int(len(ANCHORS)//2))]
                for obj in all_objs:
                    if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin'] and obj['name'] in LABELS:
                        # Box centre in grid units.
                        center_x = .5*(obj['xmin'] + obj['xmax'])
                        center_x = center_x*GRID_W
                        center_y = .5*(obj['ymin'] + obj['ymax'])
                        center_y = center_y*GRID_H
                        grid_x = int(np.floor(center_x))
                        grid_y = int(np.floor(center_y))
                        if _grid_offset == False:
                            # Make xy cell-relative; the loss adds no grid offset.
                            center_x -= grid_x
                            center_y -= grid_y
                        if grid_x < GRID_W and grid_y < GRID_H:
                            obj_indx = LABELS.index(obj['name'])
                            center_w = (obj['xmax'] - obj['xmin'])*GRID_W  # grid-cell units
                            center_h = (obj['ymax'] - obj['ymin'])*GRID_H

                            box = [center_x, center_y, center_w, center_h]

                            # Pick the anchor whose shape best matches this box.
                            best_anchor = -1
                            max_iou = -1
                            shifted_box = [0, 0, center_w, center_h]
                            for i in range(len(anchors)):
                                iou = IOU(shifted_box, anchors[i])
                                if max_iou < iou:
                                    best_anchor = i
                                    max_iou = iou

                            # Write ground truth into the responsible cell (/anchor).
                            if anc_box == True:
                                y_batch[instance_count, grid_y, grid_x, best_anchor, 0:4] = box
                                y_batch[instance_count, grid_y, grid_x, best_anchor, 4] = 1.
                                y_batch[instance_count, grid_y, grid_x, best_anchor, 5+obj_indx] = 1
                            else:
                                y_batch[instance_count, grid_y, grid_x, 0:4] = box
                                y_batch[instance_count, grid_y, grid_x, 4] = 1.
                                y_batch[instance_count, grid_y, grid_x, 5+obj_indx] = 1

                x_batch[instance_count] = img/255
                instance_count += 1

            yield (x_batch, y_batch)

Reading Data

In [6]:
import glob
import cv2
import pandas as pd
import pickle

# Load the pre-pickled COCO annotation lists (train / val / test).
# `with` guarantees the handles are closed even if unpickling fails
# (idiom fix over manual open/close).
# NOTE: pickle.load can execute arbitrary code — only open trusted files.
with open(root+'COCO/train_coco_dfs', 'rb') as f:
    train_data = pickle.load(f)
with open(root+'COCO/val_coco_dfs', 'rb') as f:
    valid_data = pickle.load(f)
with open(root+'COCO/test_coco_dfs', 'rb') as f:
    test_data = pickle.load(f)
In [7]:
# Build the train/validation batch generators.
# The original guard `if train==True or train==False:` was vacuously true,
# so the body now runs unconditionally; the dead `train_batch=[]` /
# `valid_batch=[]` placeholders (immediately reassigned) were removed.
N = len(train_data)
n_train = len(train_data)
n_valid = len(valid_data)
no_of_tr_batch = int(np.floor(n_train/BATCH_SIZE))
train_batch = Batch_Gen(train_data, no_of_tr_batch)
no_of_val_batch = int(np.floor(n_valid/BATCH_SIZE))
valid_batch = Batch_Gen(valid_data, no_of_val_batch)
In [8]:
# Count instances per class over the training set, then derive inverse-
# frequency class weights: rarer classes get a larger weight, normalised
# so the maximum weight is 1.  (Unused `name=[]` removed.)
class_wt = pd.Series([0]*len(LABELS), index=LABELS)
for sample in train_data:
    for label in (obj['name'] for obj in sample['object']):
        class_wt[label] += 1
class_weights = class_wt.sum() - class_wt
class_weights = class_weights / class_weights.max()
# CLASS_WEIGHTS = np.array(class_weights.values, dtype=np.float32)  # left disabled as in original
In [9]:
for i in range(30,60):
    image_h=(valid_data[i]['height'])
    image_w=(valid_data[i]['width'])
    img=cv2.imread(root+val_dir+valid_data[i]['filename'])
    for obj in valid_data[i]['object']:
        print(obj['name'],valid_data[i]['filename'])
        xmin,ymin,xmax,ymax=list(obj.values())[1:]
        cv2.rectangle(img, (int(xmin*image_w),int(ymin*image_h)), (int(xmax*image_w),int(ymax*image_h)), (0,255,0), 3)
    plt.imshow(img)
    plt.show()
person 000000002532.jpg
skis 000000002532.jpg
banana 000000002587.jpg
donut 000000002587.jpg
dining table 000000002592.jpg
cup 000000002592.jpg
knife 000000002592.jpg
bottle 000000002685.jpg
bottle 000000002685.jpg
bottle 000000002685.jpg
bottle 000000002685.jpg
bottle 000000002685.jpg
bottle 000000002685.jpg
bottle 000000002685.jpg
person 000000002685.jpg
person 000000002685.jpg
person 000000002685.jpg
person 000000002685.jpg
bottle 000000002685.jpg
wine glass 000000002685.jpg
cup 000000002685.jpg
person 000000002685.jpg
handbag 000000002685.jpg
wine glass 000000002685.jpg
cup 000000002685.jpg
person 000000002685.jpg
bird 000000002923.jpg
bird 000000002923.jpg
bird 000000002923.jpg
boat 000000002923.jpg
boat 000000002923.jpg
boat 000000002923.jpg
boat 000000002923.jpg
person 000000003156.jpg
sink 000000003156.jpg
toilet 000000003156.jpg
person 000000003255.jpg
person 000000003255.jpg
person 000000003255.jpg
person 000000003255.jpg
skis 000000003255.jpg
backpack 000000003255.jpg
backpack 000000003255.jpg
person 000000003255.jpg
backpack 000000003255.jpg
person 000000003255.jpg
broccoli 000000003501.jpg
broccoli 000000003501.jpg
bowl 000000003501.jpg
person 000000003553.jpg
skateboard 000000003553.jpg
banana 000000003661.jpg
keyboard 000000003661.jpg
cup 000000003661.jpg
dining table 000000003845.jpg
cup 000000003845.jpg
fork 000000003845.jpg
broccoli 000000003845.jpg
broccoli 000000003845.jpg
broccoli 000000003845.jpg
carrot 000000003845.jpg
carrot 000000003845.jpg
carrot 000000003845.jpg
carrot 000000003845.jpg
spoon 000000003845.jpg
carrot 000000003845.jpg
couch 000000003934.jpg
person 000000003934.jpg
person 000000003934.jpg
person 000000003934.jpg
cup 000000003934.jpg
remote 000000003934.jpg
remote 000000003934.jpg
person 000000003934.jpg
person 000000003934.jpg
wine glass 000000003934.jpg
wine glass 000000003934.jpg
wine glass 000000003934.jpg
person 000000003934.jpg
person 000000003934.jpg
tie 000000004134.jpg
person 000000004134.jpg
person 000000004134.jpg
person 000000004134.jpg
person 000000004134.jpg
person 000000004134.jpg
person 000000004134.jpg
person 000000004134.jpg
person 000000004134.jpg
person 000000004134.jpg
wine glass 000000004134.jpg
wine glass 000000004134.jpg
wine glass 000000004134.jpg
dining table 000000004134.jpg
person 000000004134.jpg
tie 000000004134.jpg
tie 000000004134.jpg
wine glass 000000004134.jpg
chair 000000004134.jpg
chair 000000004134.jpg
person 000000004134.jpg
person 000000004134.jpg
chair 000000004134.jpg
dining table 000000004134.jpg
person 000000004134.jpg
dining table 000000004134.jpg
tie 000000004395.jpg
person 000000004395.jpg
tv 000000004495.jpg
couch 000000004495.jpg
chair 000000004495.jpg
person 000000004765.jpg
surfboard 000000004765.jpg
cat 000000004795.jpg
laptop 000000004795.jpg
tv 000000004795.jpg
scissors 000000005001.jpg
person 000000005001.jpg
person 000000005001.jpg
person 000000005001.jpg
person 000000005001.jpg
person 000000005001.jpg
person 000000005001.jpg
person 000000005001.jpg
person 000000005001.jpg
person 000000005001.jpg
person 000000005001.jpg
person 000000005001.jpg
person 000000005001.jpg
handbag 000000005001.jpg
bicycle 000000005001.jpg
person 000000005001.jpg
scissors 000000005001.jpg
bus 000000005037.jpg
car 000000005037.jpg
person 000000005037.jpg
person 000000005037.jpg
person 000000005037.jpg
person 000000005037.jpg
person 000000005037.jpg
person 000000005037.jpg
person 000000005037.jpg
car 000000005037.jpg
cell phone 000000005060.jpg
person 000000005060.jpg
person 000000005193.jpg
person 000000005193.jpg
person 000000005193.jpg
person 000000005193.jpg
person 000000005193.jpg
surfboard 000000005193.jpg
surfboard 000000005193.jpg
bottle 000000005193.jpg
person 000000005193.jpg
airplane 000000005477.jpg
airplane 000000005477.jpg
toilet 000000005503.jpg
person 000000005529.jpg
skis 000000005529.jpg
person 000000005586.jpg
person 000000005586.jpg
tennis racket 000000005586.jpg
person 000000005586.jpg
person 000000005586.jpg
person 000000005586.jpg
person 000000005586.jpg
person 000000005586.jpg
person 000000005586.jpg
person 000000005586.jpg
person 000000005586.jpg
person 000000005586.jpg
person 000000005586.jpg
person 000000005586.jpg
spoon 000000005600.jpg
bowl 000000005600.jpg
bowl 000000005600.jpg
sheep 000000005992.jpg
sheep 000000005992.jpg
sheep 000000005992.jpg
sheep 000000005992.jpg
sheep 000000005992.jpg
banana 000000006012.jpg
banana 000000006012.jpg
train 000000006040.jpg
person 000000006040.jpg
person 000000006040.jpg
person 000000006040.jpg
car 000000006040.jpg
person 000000006040.jpg
person 000000006040.jpg
person 000000006040.jpg
person 000000006040.jpg
person 000000006040.jpg
person 000000006040.jpg
truck 000000006040.jpg
sink 000000006213.jpg
sink 000000006213.jpg
In [10]:
image_h=(test_data[i]['height']/416)
test_data[i]['height']/416
Out[10]:
1.5384615384615385

data_check=True if data_check==True: sampleX,sampleY=next(train_batch) plt.imshow(sampleX[3]) plt.show() img=sampleY[3][...,3,11+5] plt.imshow(img) plt.show() for i in range(4,5): print('index',i) plt.imshow(sampleY[3][...,3,i]*10) plt.show()

Train Block

In [11]:
model=build_model()
model.summary()
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
input_1 (InputLayer)            (None, 608, 608, 3)  0                                            
__________________________________________________________________________________________________
conv_1 (Conv2D)                 (None, 608, 608, 32) 864         input_1[0][0]                    
__________________________________________________________________________________________________
norm_1 (BatchNormalization)     (None, 608, 608, 32) 128         conv_1[0][0]                     
__________________________________________________________________________________________________
leaky_re_lu (LeakyReLU)         (None, 608, 608, 32) 0           norm_1[0][0]                     
__________________________________________________________________________________________________
max_pooling2d (MaxPooling2D)    (None, 304, 304, 32) 0           leaky_re_lu[0][0]                
__________________________________________________________________________________________________
conv_2 (Conv2D)                 (None, 304, 304, 64) 18432       max_pooling2d[0][0]              
__________________________________________________________________________________________________
norm_2 (BatchNormalization)     (None, 304, 304, 64) 256         conv_2[0][0]                     
__________________________________________________________________________________________________
leaky_re_lu_1 (LeakyReLU)       (None, 304, 304, 64) 0           norm_2[0][0]                     
__________________________________________________________________________________________________
max_pooling2d_1 (MaxPooling2D)  (None, 152, 152, 64) 0           leaky_re_lu_1[0][0]              
__________________________________________________________________________________________________
conv_3 (Conv2D)                 (None, 152, 152, 128 73728       max_pooling2d_1[0][0]            
__________________________________________________________________________________________________
norm_3 (BatchNormalization)     (None, 152, 152, 128 512         conv_3[0][0]                     
__________________________________________________________________________________________________
leaky_re_lu_2 (LeakyReLU)       (None, 152, 152, 128 0           norm_3[0][0]                     
__________________________________________________________________________________________________
conv_4 (Conv2D)                 (None, 152, 152, 64) 8192        leaky_re_lu_2[0][0]              
__________________________________________________________________________________________________
norm_4 (BatchNormalization)     (None, 152, 152, 64) 256         conv_4[0][0]                     
__________________________________________________________________________________________________
leaky_re_lu_3 (LeakyReLU)       (None, 152, 152, 64) 0           norm_4[0][0]                     
__________________________________________________________________________________________________
conv_5 (Conv2D)                 (None, 152, 152, 128 73728       leaky_re_lu_3[0][0]              
__________________________________________________________________________________________________
norm_5 (BatchNormalization)     (None, 152, 152, 128 512         conv_5[0][0]                     
__________________________________________________________________________________________________
leaky_re_lu_4 (LeakyReLU)       (None, 152, 152, 128 0           norm_5[0][0]                     
__________________________________________________________________________________________________
max_pooling2d_2 (MaxPooling2D)  (None, 76, 76, 128)  0           leaky_re_lu_4[0][0]              
__________________________________________________________________________________________________
conv_6 (Conv2D)                 (None, 76, 76, 256)  294912      max_pooling2d_2[0][0]            
__________________________________________________________________________________________________
norm_6 (BatchNormalization)     (None, 76, 76, 256)  1024        conv_6[0][0]                     
__________________________________________________________________________________________________
leaky_re_lu_5 (LeakyReLU)       (None, 76, 76, 256)  0           norm_6[0][0]                     
__________________________________________________________________________________________________
conv_7 (Conv2D)                 (None, 76, 76, 128)  32768       leaky_re_lu_5[0][0]              
__________________________________________________________________________________________________
norm_7 (BatchNormalization)     (None, 76, 76, 128)  512         conv_7[0][0]                     
__________________________________________________________________________________________________
leaky_re_lu_6 (LeakyReLU)       (None, 76, 76, 128)  0           norm_7[0][0]                     
__________________________________________________________________________________________________
conv_8 (Conv2D)                 (None, 76, 76, 256)  294912      leaky_re_lu_6[0][0]              
__________________________________________________________________________________________________
norm_8 (BatchNormalization)     (None, 76, 76, 256)  1024        conv_8[0][0]                     
__________________________________________________________________________________________________
leaky_re_lu_7 (LeakyReLU)       (None, 76, 76, 256)  0           norm_8[0][0]                     
__________________________________________________________________________________________________
max_pooling2d_3 (MaxPooling2D)  (None, 38, 38, 256)  0           leaky_re_lu_7[0][0]              
__________________________________________________________________________________________________
conv_9 (Conv2D)                 (None, 38, 38, 512)  1179648     max_pooling2d_3[0][0]            
__________________________________________________________________________________________________
norm_9 (BatchNormalization)     (None, 38, 38, 512)  2048        conv_9[0][0]                     
__________________________________________________________________________________________________
leaky_re_lu_8 (LeakyReLU)       (None, 38, 38, 512)  0           norm_9[0][0]                     
__________________________________________________________________________________________________
conv_10 (Conv2D)                (None, 38, 38, 256)  131072      leaky_re_lu_8[0][0]              
__________________________________________________________________________________________________
norm_10 (BatchNormalization)    (None, 38, 38, 256)  1024        conv_10[0][0]                    
__________________________________________________________________________________________________
leaky_re_lu_9 (LeakyReLU)       (None, 38, 38, 256)  0           norm_10[0][0]                    
__________________________________________________________________________________________________
conv_11 (Conv2D)                (None, 38, 38, 512)  1179648     leaky_re_lu_9[0][0]              
__________________________________________________________________________________________________
norm_11 (BatchNormalization)    (None, 38, 38, 512)  2048        conv_11[0][0]                    
__________________________________________________________________________________________________
leaky_re_lu_10 (LeakyReLU)      (None, 38, 38, 512)  0           norm_11[0][0]                    
__________________________________________________________________________________________________
conv_12 (Conv2D)                (None, 38, 38, 256)  131072      leaky_re_lu_10[0][0]             
__________________________________________________________________________________________________
norm_12 (BatchNormalization)    (None, 38, 38, 256)  1024        conv_12[0][0]                    
__________________________________________________________________________________________________
leaky_re_lu_11 (LeakyReLU)      (None, 38, 38, 256)  0           norm_12[0][0]                    
__________________________________________________________________________________________________
conv_13 (Conv2D)                (None, 38, 38, 512)  1179648     leaky_re_lu_11[0][0]             
__________________________________________________________________________________________________
norm_13 (BatchNormalization)    (None, 38, 38, 512)  2048        conv_13[0][0]                    
__________________________________________________________________________________________________
leaky_re_lu_12 (LeakyReLU)      (None, 38, 38, 512)  0           norm_13[0][0]                    
__________________________________________________________________________________________________
max_pooling2d_4 (MaxPooling2D)  (None, 19, 19, 512)  0           leaky_re_lu_12[0][0]             
__________________________________________________________________________________________________
conv_14 (Conv2D)                (None, 19, 19, 1024) 4718592     max_pooling2d_4[0][0]            
__________________________________________________________________________________________________
norm_14 (BatchNormalization)    (None, 19, 19, 1024) 4096        conv_14[0][0]                    
__________________________________________________________________________________________________
leaky_re_lu_13 (LeakyReLU)      (None, 19, 19, 1024) 0           norm_14[0][0]                    
__________________________________________________________________________________________________
conv_15 (Conv2D)                (None, 19, 19, 512)  524288      leaky_re_lu_13[0][0]             
__________________________________________________________________________________________________
norm_15 (BatchNormalization)    (None, 19, 19, 512)  2048        conv_15[0][0]                    
__________________________________________________________________________________________________
leaky_re_lu_14 (LeakyReLU)      (None, 19, 19, 512)  0           norm_15[0][0]                    
__________________________________________________________________________________________________
conv_16 (Conv2D)                (None, 19, 19, 1024) 4718592     leaky_re_lu_14[0][0]             
__________________________________________________________________________________________________
norm_16 (BatchNormalization)    (None, 19, 19, 1024) 4096        conv_16[0][0]                    
__________________________________________________________________________________________________
leaky_re_lu_15 (LeakyReLU)      (None, 19, 19, 1024) 0           norm_16[0][0]                    
__________________________________________________________________________________________________
conv_17 (Conv2D)                (None, 19, 19, 512)  524288      leaky_re_lu_15[0][0]             
__________________________________________________________________________________________________
norm_17 (BatchNormalization)    (None, 19, 19, 512)  2048        conv_17[0][0]                    
__________________________________________________________________________________________________
leaky_re_lu_16 (LeakyReLU)      (None, 19, 19, 512)  0           norm_17[0][0]                    
__________________________________________________________________________________________________
conv_18 (Conv2D)                (None, 19, 19, 1024) 4718592     leaky_re_lu_16[0][0]             
__________________________________________________________________________________________________
norm_18 (BatchNormalization)    (None, 19, 19, 1024) 4096        conv_18[0][0]                    
__________________________________________________________________________________________________
leaky_re_lu_17 (LeakyReLU)      (None, 19, 19, 1024) 0           norm_18[0][0]                    
__________________________________________________________________________________________________
conv_19 (Conv2D)                (None, 19, 19, 1024) 9437184     leaky_re_lu_17[0][0]             
__________________________________________________________________________________________________
norm_19 (BatchNormalization)    (None, 19, 19, 1024) 4096        conv_19[0][0]                    
__________________________________________________________________________________________________
conv_21 (Conv2D)                (None, 38, 38, 64)   32768       leaky_re_lu_12[0][0]             
__________________________________________________________________________________________________
leaky_re_lu_18 (LeakyReLU)      (None, 19, 19, 1024) 0           norm_19[0][0]                    
__________________________________________________________________________________________________
norm_21 (BatchNormalization)    (None, 38, 38, 64)   256         conv_21[0][0]                    
__________________________________________________________________________________________________
conv_20 (Conv2D)                (None, 19, 19, 1024) 9437184     leaky_re_lu_18[0][0]             
__________________________________________________________________________________________________
leaky_re_lu_20 (LeakyReLU)      (None, 38, 38, 64)   0           norm_21[0][0]                    
__________________________________________________________________________________________________
norm_20 (BatchNormalization)    (None, 19, 19, 1024) 4096        conv_20[0][0]                    
__________________________________________________________________________________________________
lambda (Lambda)                 (None, 19, 19, 256)  0           leaky_re_lu_20[0][0]             
__________________________________________________________________________________________________
leaky_re_lu_19 (LeakyReLU)      (None, 19, 19, 1024) 0           norm_20[0][0]                    
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 19, 19, 1280) 0           lambda[0][0]                     
                                                                 leaky_re_lu_19[0][0]             
__________________________________________________________________________________________________
conv_22 (Conv2D)                (None, 19, 19, 1024) 11796480    concatenate[0][0]                
__________________________________________________________________________________________________
norm_22 (BatchNormalization)    (None, 19, 19, 1024) 4096        conv_22[0][0]                    
__________________________________________________________________________________________________
leaky_re_lu_21 (LeakyReLU)      (None, 19, 19, 1024) 0           norm_22[0][0]                    
__________________________________________________________________________________________________
conv_23 (Conv2D)                (None, 19, 19, 425)  435625      leaky_re_lu_21[0][0]             
__________________________________________________________________________________________________
reshape (Reshape)               (None, 19, 19, 5, 85 0           conv_23[0][0]                    
==================================================================================================
Total params: 50,983,561
Trainable params: 50,962,889
Non-trainable params: 20,672
__________________________________________________________________________________________________
In [13]:
def train_model(model):
  """Compile `model` with the YOLO loss and train it on the batch generators.

  Relies on notebook-level state: `root`, `exp_name`, `yolo_loss_1`,
  `optimizer`, `train_batch`, `valid_batch`, `no_of_tr_batch`,
  `no_of_val_batch`, and `epochs`. The best weights (lowest val_loss) are
  checkpointed to `root + exp_name + "best.hdf5"`.

  Returns:
      (model, history): the trained model and the Keras History object.
  """
  #filepath="D:/"+exp_name+"weights-improvement-{epoch:02d}-{val_loss:.2f}.hdf5"
  filepath=root+exp_name+"best.hdf5"
  checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
  callbacks_list = [checkpoint]
  # Fix: the model was previously compiled twice with identical arguments
  # (an unconditional compile followed by the same call under `if anc_box`);
  # a single compile is sufficient and behaviorally identical.
  model.compile(loss=yolo_loss_1, optimizer=optimizer, metrics=['accuracy'])
  history=model.fit_generator(train_batch,
                              steps_per_epoch=no_of_tr_batch,
                              epochs=epochs,
                              callbacks=callbacks_list,
                              validation_data=valid_batch,
                              validation_steps=no_of_val_batch)

  return model,history
In [14]:
if train==True:
    # Fine-tune: load pretrained weights, then freeze all but the last 31
    # layers before training.
    model=load_weights(model,path_wts)
    for layer in model.layers[:-31]:
        layer.trainable=False
    model,history=train_model(model)


    # NOTE(review): this figure overlays 'acc' and 'loss' but is titled
    # "model accuracy" with a ['train', 'test'] legend — the plotted series do
    # not match the legend labels; confirm which curves were intended.
    # NOTE(review): newer Keras versions key this as 'accuracy', not 'acc' —
    # verify against the installed version.
    plt.plot(history.history['acc'])
    plt.plot(history.history['loss'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

    # summarize history for loss (skipping the first 5 epochs so early large
    # losses don't flatten the curve)
    plt.plot(range(5,epochs),history.history['loss'][5:])
    plt.plot(range(5,epochs),history.history['val_loss'][5:])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train_loss', 'validation_loss'], loc='upper left')
    plt.savefig(root+exp_name+'yolo-v1_exp.png')
    plt.show()


    # Persist the loss curves for later comparison.
    # NOTE(review): the ':' in this filename is invalid on Windows (root is
    # 'D:/'); confirm this path was ever writable on the training machine.
    f=open(root+exp_name+'history:_ep_70_v-1.0_exp','wb')
    pickle.dump([history.history['loss'],history.history['val_loss']],f)
    f.close()
    # Report the epoch with the lowest validation loss.
    val_loss=np.array(history.history['val_loss'])
    best_epoch=np.argmin(val_loss) 
    print(best_epoch,val_loss[best_epoch])
# test_mode 1: weights saved via model.save_weights; test_mode 2: weights
# loaded through the custom load_weights() helper (pretrained checkpoint).
if test_mode==1:
    model.load_weights(path_wts_final)
if test_mode==2:
    model=load_weights(model,path_wts)

Prediction Block

In [15]:
def _softmax(x, axis=-1, t=-100.):
    x = x - np.max(x)
    
    if np.min(x) < t:
        x = x/np.min(x)*t
        
    e_x = np.exp(x)
    
    return e_x / e_x.sum(axis, keepdims=True)

def _sigmoid(x):
    return 1. / (1. + np.exp(-x))

def nmax_supp(boxes):
    """Greedy non-maximum suppression over decoded boxes.

    Args:
        boxes: iterable of (x1, y1, x2, y2, conf, class) tuples.

    Boxes are sorted by descending confidence; for each kept box, any
    lower-confidence box of the SAME class whose IOU with it is >= 0.1 has
    its confidence zeroed. Zero-confidence rows are dropped.

    Returns:
        list of surviving rows (each a length-6 array:
        x1, y1, x2, y2, conf, class).
    """
    df=pd.DataFrame(boxes,columns=['x','y','w','h','conf','_class'])
    sdf=df.sort_values('conf',ascending=False).reset_index(drop=True)
    for ind1 in range(len(sdf)-1):
        box1=sdf.loc[ind1,['x','y','w','h']].values
        b1_class=sdf.loc[ind1,['_class']].values
        for ind2 in range(ind1+1,len(sdf)):
            b2_class=sdf.loc[ind2,['_class']].values
            b2_conf=sdf.loc[ind2,['conf']].values
            # Fix: the class equality was previously tested twice (here and
            # again immediately before the IOU call); one check suffices.
            if b2_class==b1_class and b2_conf>0.0:
                 box2=sdf.loc[ind2,['x','y','w','h']].values
                 if IOU(box1,box2)>=0.1:
                    sdf.loc[ind2,'conf']=0.0
    ndf=sdf[sdf.conf>0.0]
    return list(ndf.values)

def decode_netout_anc(netout, anchors, nb_class, obj_threshold=0.3, nms_threshold=0.3):
    """Decode a YOLOv2 (anchor-box) network output grid into NMS'd boxes.

    Args:
        netout: array of shape (grid_h, grid_w, nb_box, 4 + 1 + nb_class);
            mutated in place (objectness sigmoided, class scores scaled and
            thresholded).
        anchors: flat list [w0, h0, w1, h1, ...] of anchor priors, in
            grid-cell units.
        nb_class: number of classes (kept for interface compatibility).
        obj_threshold: class scores at or below this are zeroed.
        nms_threshold: kept for interface compatibility (NMS threshold is
            fixed inside nmax_supp).

    Returns:
        list of (x1, y1, x2, y2, confidence, class_index) boxes with
        coordinates as fractions of image size, after nmax_supp.
    """
    # Fix: read the box count from the tensor instead of the global BOX —
    # equivalent for the (19, 19, 5, 85) outputs this model produces.
    grid_h, grid_w, nb_box = netout.shape[:3]

    # Objectness -> probability; class scores -> class probability scaled by
    # objectness, then zeroed at/below the threshold.
    netout[..., 4]  = _sigmoid(netout[..., 4])
    netout[..., 5:] = netout[..., 4][..., np.newaxis] * _softmax(netout[..., 5:])
    netout[..., 5:] *= netout[..., 5:] > obj_threshold

    boxes = []
    for row in range(grid_h):
        for col in range(grid_w):
            for b in range(nb_box):
                classes = netout[row,col,b,5:]
                confidence = netout[row,col,b,4]
                if np.sum(classes) > 0:
                    x, y, w, h = netout[row,col,b,:4]
                    x = (col + _sigmoid(x)) / grid_w # center position, unit: image width
                    y = (row + _sigmoid(y)) / grid_h # center position, unit: image height
                    # Fix: use the `anchors` argument instead of the global
                    # ANCHORS (the only caller passes anchors=ANCHORS, so
                    # behaviour is unchanged).
                    w = anchors[2 * b + 0] * np.exp(w) / grid_w # unit: image width
                    h = anchors[2 * b + 1] * np.exp(h) / grid_h # unit: image height
                    class_ind = np.argmax(classes)
                    # Fix: the box tuple was previously built twice on
                    # consecutive identical lines; once is enough.
                    box = (x-w/2, y-h/2, x+w/2, y+h/2, confidence, class_ind)
                    boxes.append(box)
    f_boxes=nmax_supp(boxes)
    return f_boxes

def decode_netout_1(netout, anchors, nb_class, obj_threshold=0.3, nms_threshold=0.3):
    """Decode a YOLO-v1-style (no anchor box) output grid into NMS'd boxes.

    `netout` has shape (grid_h, grid_w, 4 + 1 + nb_class) and is mutated in
    place (objectness sigmoided, class scores scaled and thresholded).
    `anchors` and `nms_threshold` are accepted but unused here. Returns the
    boxes surviving nmax_supp as (x1, y1, x2, y2, conf, class_index) with
    coordinates as fractions of the image size.
    """
    #grid_h, grid_w, nb_box = netout.shape[:3]
    grid_h, grid_w = netout.shape[:2]
    boxes = []
    
    # decode the output by the network
    netout[..., 4]  = _sigmoid(netout[..., 4])
    netout[..., 5:] = netout[..., 4][..., np.newaxis] * _softmax(netout[..., 5:])
    netout[..., 5:] *= netout[..., 5:] > obj_threshold
    # Tracks the best probability seen per class (written but never read
    # after the loop).
    class_prob_log=[0]*CLASS
    #boxes=[(0,)*6]*CLASS
    boxes=[]
    for row in range(grid_h):
        for col in range(grid_w):
            #for b in range(BOX):
                # from 4th element onwards are confidence and class classes
                classes = netout[row,col,5:]
                #classes = netout[row,col,5:]
                confidence = netout[row,col,4]
                if np.sum(classes) > 0:
                    # first 4 elements are x, y, w, and h
                    #x, y, w, h = netout[row,col,b,:4]
                    x, y, w, h = netout[row,col,:4]
                    #print(col,_sigmoid(x-col),row,_sigmoid(y),w,h)
                    # Cell-size to image-size ratios. NOTE(review): both
                    # numerators use IMAGE_W/GRID_W, and x is scaled by
                    # h_ratio while y uses w_ratio — this looks swapped
                    # unless IMAGE_W == IMAGE_H; confirm intent.
                    w_ratio=(IMAGE_W/GRID_W)/IMAGE_W
                    h_ratio=(IMAGE_W/GRID_W)/IMAGE_H
                    x = (col+_sigmoid(x))*(h_ratio)#x*(32/416) # center position, unit: image width
                    y = (row+_sigmoid(y))*(w_ratio)#y*(32/416) # center position, unit: image height
                    w = w*(w_ratio) # unit: image width
                    h = h*(h_ratio) # unit: image height
                    #print(x,y,w,h)
                    class_ind=np.argmax(classes)
                    #if class_prob_log[class_ind]< classes[class_ind]:       
                    class_prob_log[class_ind]=classes[class_ind]
                    # Convert center/size to corner coordinates.
                    box = (x-w/2, y-h/2, x+w/2, y+h/2, confidence, class_ind)
                    #if abs(box[0])<=1 and abs(box[1])<=1 and box[2]<=1 and box[3]<=1 :
                    #    if (box[0])>=0 and (box[1])>=0 and box[2]>=0 and box[3]>=0 :
                    boxes.append(box)
                                       #boxes[class_ind]=box

                      
    
    f_boxes=nmax_supp(boxes)
    return f_boxes#[:5]   

def draw_boxes_1(image, boxes, labels, t_lbl=None):
    """Draw predicted boxes (green) onto `image` and return it.

    Ground-truth boxes (`t_lbl`, blue) are only drawn when the local
    `show_gt` flag is flipped on; it is hard-coded off.

    boxes: (x1, y1, x2, y2, conf, class_idx) with coordinates as fractions
    of the image size; labels: class names indexed by class_idx.
    """
    image_h, image_w, _ = image.shape
    # Scale factors from the network input size back to this image's size
    # (used only for the ground-truth overlay).
    wf = image_w / IMAGE_W
    hf = image_h / IMAGE_H
    show_gt = False
    if show_gt == True:
        for gt in t_lbl:
            vals = list(gt.values())
            xmin, ymin = int(vals[1] * wf), int(vals[2] * hf)
            xmax, ymax = int(vals[3] * wf), int(vals[4] * hf)
            cv2.rectangle(image, (xmin, ymin), (xmax, ymax), (255, 0, 0), 3)
            cv2.putText(image,
                        vals[0] + ' ',
                        (xmin, ymin - 13),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.0015 * 400,
                        (255, 0, 0), 2)
    for pred in boxes:
        # Fractional coordinates -> absolute pixels.
        xmin, ymin = int(pred[0] * image_w), int(pred[1] * image_h)
        xmax, ymax = int(pred[2] * image_w), int(pred[3] * image_h)
        cv2.rectangle(image, (xmin, ymin), (xmax, ymax), (0, 255, 0), 3)
        cv2.putText(image,
                    labels[int(pred[5])] + ' ' + str(pred[4]),
                    (xmin, ymin - 13),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.0015 * 400,
                    (0, 255, 0), 2)

    return image
In [16]:
import json
# NOTE(review): pickle is already imported in the setup cell; this repeat is
# harmless but redundant.
import pickle
# Evaluation-mode flags for the prediction loop below:
test_on_val =False   # run inference over the COCO val2017 split
test_on_test=True    # run inference over the COCO test2017 split
test_server=True     # accumulate COCO-server-format detection dicts
calc_map=False       # dump ground-truth/detection txt files for mAP scoring
In [17]:
error=0
count=0
# NOTE(review): mid-notebook imports; conventionally these belong in the top
# import cell.
import pandas as pd
from copy import copy, deepcopy
from tqdm import tqdm,trange
# Per-detection results table, populated only when calc_map is True.
result_df=pd.DataFrame({},columns=['class','iou','correct','True','img','boxx1','boxy1','boxx2','boxy2','conf'])
ind_loc=0
total_true_per_class={}
for _class in LABELS:
        total_true_per_class[_class]=0

# NOTE(review): os.mkdir fails if the parent 'input' directory does not
# already exist — os.makedirs would be safer; confirm 'input' is pre-created.
if not os.path.exists('input/ground-truth'):
    os.mkdir('input/ground-truth')
if not os.path.exists('input/detection-results'):
    os.mkdir('input/detection-results')

pred_dicts=[]
for i in trange(0,len(test_data)):
  count+=1
  # NOTE(review): if both test_on_test and test_on_val are False, `image` is
  # never assigned and the shape access below raises NameError. The loop also
  # always iterates over test_data even in val mode — confirm intended.
  if test_on_test==True:
     sample=test_data[i]
     image = cv2.imread('D:/COCO/test2017/test2017/'+sample['filename'])
  if test_on_val==True:
     sample=valid_data[i]
     image = cv2.imread('D:/COCO/val2017/val2017/'+sample['filename'])
  image_h,image_w,_=image.shape
  # Resize to the network input size and scale pixels to [0, 1], then add a
  # batch dimension for predict().
  input_image = cv2.resize(image, (IMAGE_W, IMAGE_H))
  input_image = input_image / 255.
  input_image = np.expand_dims(input_image, 0)
  netout = model.predict(input_image)
  
  # NOTE(review): pred_labels is only assigned when anc_box is True; with
  # anc_box False the uses below raise NameError (or see a stale value).
  if anc_box==True:
    pred_labels = decode_netout_anc(netout[0], 
                        obj_threshold=0.30,
                        nms_threshold=.90,
                        anchors=ANCHORS, 
                        nb_class=CLASS)
  if test_server==True:
  
      # Convert each (x1, y1, x2, y2, conf, class) box to the COCO results
      # format: bbox is [x, y, width, height] in absolute pixels.
      for pred_box in pred_labels:
          _x= pred_box[0]
          _y= pred_box[1]
          _width=(pred_box[2]-_x)
          _height=(pred_box[3]-_y)
          _class=list(_LABELS.keys())[int(pred_box[5])]
          pred={ 
              "image_id": int(sample['id']), 
              "category_id": int(_class), 
              "bbox": [round(_x*image_w,2),round(_y*image_h,2),round(_width*image_w,2),round(_height*image_h,2)], 
              "score": float(pred_box[4]),
          }
          # NOTE(review): true_labels is only set inside this loop; when
          # pred_labels is empty (and calc_map is False) it is undefined at
          # the draw_boxes_1 call below.
          true_labels=None
          pred_dicts.append(pred)
      
  if calc_map==True: 
      true_labels=sample['object']
    
      # Group ground-truth and predicted boxes by class name.
      true_lb={}
      pred_lb={}
      for lbl in LABELS:
           true_lb[lbl]=[]
           pred_lb[lbl]=[]
      # Write ground-truth boxes in the text format expected by map.py.
      f=open('input/ground-truth/'+str(i)+'.txt','w')
      str_true=''
      for lbl in true_labels:
              [name,xmin,ymin,xmax,ymax]=list(lbl.values())
              true_lb[name].append([(xmin),(ymin), (xmax),(ymax)])#[cx,cy,iw,ih]
              # Join two-word class names with '-' so map.py's space-split
              # fields stay aligned; single-word names raise IndexError and
              # are left unchanged by the bare except.
              try:name=name.split(' ')[0]+'-'+name.split(' ')[1]
              except:0
              str_true+=str(name)+' '+str(xmin)+' '+str(ymin)+' '+str(xmax)+' '+str(ymax)+'\n'
      f.write(str_true)
      f.close()

      # Write this image's detections for map.py.
      f=open('input/detection-results/'+str(i)+'.txt','w')
      str_pred=''
      for box in pred_labels: 
              [xmin,ymin,xmax,ymax,conf,_name]=box
              name=LABELS[int(_name)]
              pred_lb[name].append([(xmin),(ymin), (xmax),(ymax),conf])#[cx,cy,iw,ih]
              try:name=name.split(' ')[0]+'-'+name.split(' ')[1]
              except:0
              str_pred+=str(name)+' '+str(conf)+' '+str(xmin)+' '+str(ymin)+' '+str(xmax)+' '+str(ymax)+'\n'
      f.write(str_pred)
      f.close()
           

      # Per-class bookkeeping: count ground truths, and for each prediction
      # record its best IOU against any ground-truth box of that class.
      for _class in LABELS:
                for true_box in true_lb[_class]:
                    total_true_per_class[_class]+=1
                for pred_box in pred_lb[_class]:
                    _max_iou=0
                    for _ind,true_box in enumerate(true_lb[_class]):
                        _iou= IOU(true_box,pred_box[:4])
                        if _iou>=_max_iou:
                                _max_iou=_iou

                    result_df.loc[ind_loc,['class','iou','True','img','boxx1','boxy1','boxx2','boxy2','conf']]=[_class,_max_iou,1.0,i,pred_box[0],pred_box[1],pred_box[2],pred_box[3],pred_box[4]]
                    # A prediction counts as correct at the standard 0.5 IOU
                    # threshold.
                    if _max_iou>=0.5:
                            result_df.loc[ind_loc,'correct']=1
                    else :
                            result_df.loc[ind_loc,'correct']=0
                    ind_loc+=1
            
       
  image = draw_boxes_1(image, pred_labels, LABELS,true_labels)
  
  # Display the first 100 annotated images in rows of three.
  if i<100:
      _image=cv2.resize(image,(200,200))
      if i%3==0 :fig,axes=plt.subplots(nrows=1,ncols=3,figsize=(20,5))  
      axes[i%3].imshow(_image[:,:,::-1])
      axes[i%3].set_title(str(i))#+','+str(_ious))
      if i%3==3-1 :plt.show()
  0%|                                                                             | 1/20288 [00:02<13:31:55,  2.40s/it]
  0%|                                                                              | 5/20288 [00:03<5:25:23,  1.04it/s]
  0%|                                                                              | 8/20288 [00:04<2:55:40,  1.92it/s]
  0%|                                                                             | 11/20288 [00:05<1:55:46,  2.92it/s]
  0%|                                                                             | 14/20288 [00:05<1:25:14,  3.96it/s]
  0%|                                                                             | 16/20288 [00:06<1:36:30,  3.50it/s]
  0%|                                                                             | 20/20288 [00:07<1:26:05,  3.92it/s]
  0%|                                                                             | 22/20288 [00:08<1:22:39,  4.09it/s]
  0%|                                                                             | 25/20288 [00:08<1:12:49,  4.64it/s]
  0%|                                                                             | 29/20288 [00:11<3:25:38,  1.64it/s]
  0%|                                                                             | 31/20288 [00:11<2:19:19,  2.42it/s]
  0%|▏                                                                            | 35/20288 [00:12<1:50:01,  3.07it/s]
  0%|▏                                                                            | 38/20288 [00:13<1:40:03,  3.37it/s]
  0%|▏                                                                            | 40/20288 [00:14<1:41:06,  3.34it/s]
  0%|▏                                                                            | 44/20288 [00:15<1:13:59,  4.56it/s]
  0%|▏                                                                            | 47/20288 [00:16<1:43:28,  3.26it/s]
  0%|▏                                                                            | 50/20288 [00:17<1:14:03,  4.55it/s]
  0%|▏                                                                            | 53/20288 [00:17<1:08:22,  4.93it/s]
  0%|▏                                                                            | 55/20288 [00:18<1:15:15,  4.48it/s]
  0%|▏                                                                            | 58/20288 [00:18<1:11:07,  4.74it/s]
  0%|▏                                                                            | 62/20288 [00:19<1:17:00,  4.38it/s]
  0%|▏                                                                            | 65/20288 [00:21<1:44:49,  3.22it/s]
  0%|▎                                                                            | 67/20288 [00:22<2:13:13,  2.53it/s]
  0%|▎                                                                            | 70/20288 [00:22<1:45:35,  3.19it/s]
  0%|▎                                                                            | 73/20288 [00:23<1:25:52,  3.92it/s]
  0%|▎                                                                            | 77/20288 [00:24<1:21:26,  4.14it/s]
  0%|▎                                                                            | 80/20288 [00:25<1:13:40,  4.57it/s]
  0%|▎                                                                            | 83/20288 [00:25<1:12:02,  4.67it/s]
  0%|▎                                                                            | 85/20288 [00:26<1:29:46,  3.75it/s]
  0%|▎                                                                            | 89/20288 [00:27<1:32:59,  3.62it/s]
  0%|▎                                                                            | 91/20288 [00:28<1:39:04,  3.40it/s]
  0%|▎                                                                            | 95/20288 [00:29<1:20:58,  4.16it/s]
  0%|▎                                                                            | 97/20288 [00:29<1:31:12,  3.69it/s]
100%|██████████████████████████████████████████████████████████████████████████| 20288/20288 [1:32:42<00:00,  3.65it/s]
In [18]:
#result_df#.loc[:50]
In [19]:
# Total number of detections accumulated for the COCO-server submission.
len(pred_dicts)
Out[19]:
80068
In [27]:
# Write the accumulated detections in COCO results format for the evaluation
# server. `with` guarantees the file handle is closed even if json.dump
# raises (the previous open/close pair leaked the handle on error).
with open('test2017_detections_yolo_results.json', 'w') as f:
    json.dump(pred_dicts, f)
In [21]:
!python map.py
77.12% = airplane AP 
5.90% = apple AP 
16.74% = backpack AP 
17.13% = banana AP 
52.30% = baseball-bat AP 
42.74% = baseball-glove AP 
73.52% = bear AP 
63.46% = bed AP 
30.54% = bench AP 
39.20% = bicycle AP 
40.12% = bird AP 
35.44% = boat AP 
17.31% = book AP 
35.99% = bottle AP 
38.06% = bowl AP 
39.76% = broccoli AP 
69.10% = bus AP 
24.13% = cake AP 
37.34% = car AP 
0.14% = carrot AP 
76.51% = cat AP 
34.98% = cell-phone AP 
35.45% = chair AP 
65.04% = clock AP 
47.40% = couch AP 
48.60% = cow AP 
36.47% = cup AP 
31.69% = dining-table AP 
60.76% = dog AP 
23.14% = donut AP 
65.80% = elephant AP 
67.67% = fire-hydrant AP 
30.28% = fork AP 
60.66% = frisbee AP 
64.85% = giraffe AP 
4.55% = hair-drier AP 
14.97% = handbag AP 
57.42% = horse AP 
2.40% = hot-dog AP 
67.74% = keyboard AP 
37.67% = kite AP 
12.05% = knife AP 
63.06% = laptop AP 
56.98% = microwave AP 
49.66% = motorcycle AP 
71.95% = mouse AP 
6.75% = orange AP 
56.57% = oven AP 
57.28% = parking-meter AP 
53.85% = person AP 
8.45% = pizza AP 
39.55% = potted-plant AP 
56.79% = refrigerator AP 
23.65% = remote AP 
3.95% = sandwich AP 
37.42% = scissors AP 
49.75% = sheep AP 
63.93% = sink AP 
67.68% = skateboard AP 
43.04% = skis AP 
42.86% = snowboard AP 
17.74% = spoon AP 
34.50% = sports-ball AP 
62.53% = stop-sign AP 
35.66% = suitcase AP 
56.65% = surfboard AP 
46.61% = teddy-bear AP 
68.83% = tennis-racket AP 
46.42% = tie AP 
5.56% = toaster AP 
73.49% = toilet AP 
18.87% = toothbrush AP 
33.88% = traffic-light AP 
75.39% = train AP 
34.33% = truck AP 
58.47% = tv AP 
47.96% = umbrella AP 
38.78% = vase AP 
30.71% = wine-glass AP 
63.55% = zebra AP 
mAP = 42.57%
Figure(640x1680)

End

In [ ]: